In [1]:
%matplotlib inline
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import os
In [2]:
# Data: https://archive.ics.uci.edu/ml/datasets/Pima+Indians+Diabetes
# Input Features: ['preg_count', 'glucose_concentration', 'diastolic_bp',
# 'triceps_skin_fold_thickness', 'two_hr_serum_insulin', 'bmi',
# 'diabetes_pedi', 'age']
# Target: 'diabetes_class'. 1 => Diabetic. 0 => Normal.
# Objective: Predict probability of diabetes
# Actual Positives: 268 (diabetic)
# Actual Negatives: 500 (normal)
# Diabetes Dataset Size: 768 samples
# Training + Eval set: 710 samples
# Test set: 58 samples
In [3]:
data_path = r'..\Data\ClassExamples\DiabetesData'
In [4]:
df = pd.read_csv(os.path.join(data_path, 'pima-indians-diabetes.data.txt'))
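The raw UCI download does not include a header row; if this copy of the file lacks one, the column names must be passed explicitly. A minimal sketch, assuming the feature order listed above:
# Only needed if the file has no header row (the raw UCI file does not)
col_names = ['preg_count', 'glucose_concentration', 'diastolic_bp',
             'triceps_skin_fold_thickness', 'two_hr_serum_insulin', 'bmi',
             'diabetes_pedi', 'age', 'diabetes_class']
df = pd.read_csv(os.path.join(data_path, 'pima-indians-diabetes.data.txt'),
                 header = None, names = col_names)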
In [5]:
df.columns
Out[5]:
In [6]:
df.shape
Out[6]:
In [7]:
df.head()
Out[7]:
In [8]:
df.diabetes_class.value_counts()
Out[8]:
In [9]:
df.corr()
Out[9]:
In [10]:
# Boolean mask selecting the diabetic samples
temp_diabetic = df.diabetes_class == 1
In [11]:
# Diabetic glucose concentration histogram
fig = plt.figure(figsize = (12, 8))
plt.hist(df[temp_diabetic].glucose_concentration)
plt.title('diabetic - glucose')
plt.xlabel('Glucose Level')
plt.ylabel('Count')
Out[11]:
In [12]:
# Normal (non-diabetic) glucose concentration histogram
fig = plt.figure(figsize = (12, 8))
plt.hist(df[~temp_diabetic].glucose_concentration)
plt.title('normal - glucose')
plt.xlabel('Glucose Level')
plt.ylabel('Count')
Out[12]:
In [13]:
fig = plt.figure(figsize = (12, 8))
plt.hist(df[temp_diabetic].bmi)
plt.xlabel('bmi')
plt.ylabel('count')
plt.title('diabetic - bmi')
Out[13]:
In [14]:
fig = plt.figure(figsize = (12, 8))
plt.hist(df[~temp_diabetic].bmi)
plt.xlabel('bmi')
plt.ylabel('count')
plt.title('normal - bmi')
Out[14]:
In [15]:
fig = plt.figure(figsize = (12, 8))
plt.hist(df[temp_diabetic].age)
plt.xlabel('age')
plt.ylabel('count')
plt.title('diabetic - age')
Out[15]:
In [16]:
fig = plt.figure(figsize = (12, 8))
plt.hist(df[~temp_diabetic].age)
plt.xlabel('age')
plt.ylabel('count')
plt.title('normal - age')
Out[16]:
In [17]:
fig = plt.figure(figsize = (12, 8))
plt.hist([df[temp_diabetic].age,
          df[~temp_diabetic].age],
         label = ['diab', 'normal'])
plt.xlabel('Age')
plt.ylabel('count')
plt.title('Age')
plt.legend()
Out[17]:
In [18]:
# Sequential split: first 710 rows for training + evaluation, last 58 held out for test
df_train_eval = df.iloc[:710]
df_test_eval = df.iloc[710:768]
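Note the split above is sequential, following the row order of the source file. If that order were not random, a shuffled split would avoid ordering bias; a minimal sketch (not what the saved CSVs below were built from, and the variable names are illustrative):
# Shuffle reproducibly with a fixed seed, then split at the same boundary
shuffled = df.sample(frac = 1, random_state = 42).reset_index(drop = True)
df_train_shuffled = shuffled.iloc[:710]
df_test_shuffled = shuffled.iloc[710:]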
In [19]:
df_train_eval.to_csv(os.path.join(data_path, 'diabetes_data_train_710samples.csv'),
                     index = True,
                     index_label = 'Row')
In [20]:
df_test_eval.to_csv(os.path.join(data_path, 'diabetes_data_eval_58samples.csv'),
                    index = True,
                    index_label = 'Row')
In [21]:
# Export all columns except the target attribute (the model predicts it)
df.to_csv(os.path.join(data_path, 'diabetest_data_test_all.csv'),
          index = True,
          index_label = 'Row',
          columns = df.columns[:-1])
In [22]:
# Predicted output from AWS ML
df_predicted = pd.read_csv(os.path.join(data_path,
                                        'output',
                                        'bp-dHhriWXAJNj-diabetest_data_test_all.csv.gz'))
In [23]:
df_predicted.head()
Out[23]:
In [24]:
diab_table = pd.crosstab(
    df.diabetes_class,
    df_predicted.bestAnswer,
    rownames = ['Actual'],
    colnames = ['Predicted'])
In [25]:
# A confusion (contingency) matrix is an important diagnostic:
# it shows how many samples were correctly classified and how many
# misclassifications occurred. Especially helpful when positive samples are few.
diab_table
Out[25]:
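As a cross-check, scikit-learn computes the same matrix; a minimal sketch (rows are actual classes, columns are predicted, matching the crosstab above):
from sklearn.metrics import confusion_matrix
# Rows: actual (0 = normal, 1 = diabetic); columns: predicted
print(confusion_matrix(df.diabetes_class, df_predicted.bestAnswer))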
In [26]:
fig = plt.figure(figsize = (12, 8))
# First crosstab column: samples predicted normal, for each actual class
plt.bar([0, 1],
        diab_table.iloc[:, 0],
        width = .35,
        label = 'Predicted Normal',
        color = 'g')
# Second crosstab column: samples predicted diabetic, stacked on top
plt.bar([0, 1],
        diab_table.iloc[:, 1],
        width = .35,
        color = 'b',
        label = 'Predicted Diabetic',
        bottom = diab_table.iloc[:, 0])
plt.ylabel('Count')
plt.xticks([0, 1], ('Actual Normal', 'Actual Diabetic'))
plt.grid()
plt.legend()
Out[26]:
AUC is the area under the curve formed by plotting the True Positive Rate against the False Positive Rate at different cut-off thresholds.
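If the prediction file includes the raw prediction score (AWS ML batch predictions typically carry a 'score' column next to 'bestAnswer'; verify the column name in your own output), the ROC curve and AUC can be reproduced locally with scikit-learn. A minimal sketch, assuming that 'score' column exists:
from sklearn.metrics import roc_curve, roc_auc_score
# Assumes df_predicted.score is the predicted probability of the positive class
fpr_vals, tpr_vals, thresholds = roc_curve(df.diabetes_class, df_predicted.score)
auc = roc_auc_score(df.diabetes_class, df_predicted.score)
fig = plt.figure(figsize = (12, 8))
plt.plot(fpr_vals, tpr_vals, label = 'AUC = {0:3.2f}'.format(auc))
plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()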
In [27]:
# Example showing how to compute these metrics by hand.
# AWS ML provides all of these metrics in its model evaluation.
actual_negative = df.diabetes_class.value_counts()[0]
actual_positive = df.diabetes_class.value_counts()[1]
actual_count = actual_negative + actual_positive
In [28]:
actual_negative, actual_positive
Out[28]:
In [29]:
# Rows of diab_table are actual classes; columns are predicted classes
true_negative = diab_table.iloc[0, 0]   # actual normal, predicted normal
false_positive = diab_table.iloc[0, 1]  # actual normal, predicted diabetic
true_positive = diab_table.iloc[1, 1]   # actual diabetic, predicted diabetic
false_negative = diab_table.iloc[1, 0]  # actual diabetic, predicted normal
In [30]:
diab_table
Out[30]:
In [31]:
true_negative, false_positive
Out[31]:
In [32]:
true_positive, false_negative
Out[32]:
In [33]:
# Accuracy - larger value indicates better predictive accuracy
# How many were correctly classified?
accuracy = (true_negative + true_positive) / actual_count
print('Accuracy = {0:3.2f}'.format(accuracy))
In [34]:
# True Positive Rate (also known as Recall) - larger value indicates better predictive accuracy
# Out of all actual positives, how many were correctly predicted as positive?
tpr = true_positive / actual_positive
print('Probability of detection. TPR = {0:3.2f}'.format(tpr))
In [35]:
# False Positive Rate - smaller value indicates better predictive accuracy
# Out of all actual negatives, how many were incorrectly predicted as positive?
fpr = false_positive / actual_negative
print('Probability of false alarm. FPR = {0:3.2f}'.format(fpr))
In [36]:
# Precision - out of all samples predicted as positive, how many are true positives?
# Larger value indicates better predictive accuracy
precision = true_positive / (true_positive + false_positive)
print('Precision = {0:3.2f}'.format(precision))
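As a sanity check, scikit-learn's metric helpers should reproduce the hand computations above (scikit-learn is not otherwise used in this notebook, so treat this as an optional verification):
from sklearn.metrics import accuracy_score, recall_score, precision_score
y_true = df.diabetes_class
y_pred = df_predicted.bestAnswer
# Should match the hand-computed accuracy, TPR (recall), and precision above
print('Accuracy = {0:3.2f}'.format(accuracy_score(y_true, y_pred)))
print('Recall = {0:3.2f}'.format(recall_score(y_true, y_pred)))
print('Precision = {0:3.2f}'.format(precision_score(y_true, y_pred)))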
Advanced Metrics